# Attach dependencies with library() rather than require(): require() only
# returns FALSE (with a warning) when a package is missing, which lets the
# script continue and fail later with an unrelated-looking error.
library(knitr)
library(dplyr)
library(ggplot2)
library(kml)

# Load the training table; the rest of the script expects this to create
# the data frame `table.appr`.
load("Donnees/table.appr.RData")

# Columns 3 to 26 hold the 24 values that are clustered below (they are
# plotted against 0:23 further down, presumably one value per hour of the day).
don <- table.appr[, 3:26]
On regarde l’évolution du rapport variance intra-classe / variance totale en fonction du nombre de classes k
# Elbow criterion: for every candidate number of clusters k, fit a k-means on
# `don` and store the ratio within-cluster sum of squares / total sum of
# squares (tot.withinss / totss).
#
# A previous run with the default iter.max = 10 produced
# "Warning: did not converge in 10 iterations", so the iteration cap is raised
# to give the algorithm room to converge.
k_values <- 2:20
res <- numeric(length(k_values))
for (k in k_values) {
  kmeans.k <- kmeans(don, centers = k, iter.max = 100)
  # Index by position of k in k_values so the range can change freely.
  res[match(k, k_values)] <- kmeans.k$tot.withinss / kmeans.k$totss
}

# One point per k; the "elbow" of this curve suggests a number of clusters.
plot(k_values, res, type = "b",
     xlab = "k", ylab = "tot.withinss / totss")
On commence par faire un k-means avec 5, 7 et 10 centres pour voir ce qui se passe
# Exploratory k-means fits on the profile matrix with 5, 7 and 10 centres.
# The fits are order-dependent because each call draws random starting centres.
gp5 <- kmeans(x = don, centers = 5)
gp7 <- kmeans(x = don, centers = 7)
gp10 <- kmeans(x = don, centers = 10)
On trace le centre de chaque classe ainsi que 10 profils station-jour tirés au hasard dans cette classe
# Attach the cluster label of each k-means fit to the training table.
table.appr$gp5_class <- gp5$cluster
table.appr$gp7_class <- gp7$cluster
table.appr$gp10_class <- gp10$cluster

#' Plot each cluster centre with a few randomly drawn member profiles.
#'
#' Draws one panel per row of `centers`: the centre as a thick blue line,
#' then up to `n_sample` member rows of `data` as grey lines.
#'
#' @param centers Matrix of cluster centres, one row per cluster.
#' @param data Data frame holding the individual profiles.
#' @param classes Vector of cluster labels, one per row of `data`; cluster i
#'   is matched against row i of `centers`.
#' @param traj_cols Columns of `data` that hold the profile values.
#' @param n_sample Maximum number of member profiles drawn per cluster.
#' @return `NULL`, invisibly; called for its plotting side effect.
plot_cluster_profiles <- function(centers, data, classes,
                                  traj_cols = 3:26, n_sample = 10) {
  x_axis <- seq_len(ncol(centers)) - 1L
  for (i in seq_len(nrow(centers))) {
    plot(x_axis, centers[i, ], type = "l", col = "blue", lwd = 3,
         xlab = "", ylab = "", ylim = c(0, 1))
    members <- which(classes == i)
    # Cap the sample size: sampling more rows than the cluster contains
    # (without replacement) is an error.
    n_draw <- min(n_sample, length(members))
    picked <- members[sample.int(length(members), n_draw)]
    for (row in picked) {
      lines(x_axis, data[row, traj_cols], col = "grey")
    }
  }
  invisible(NULL)
}

# 5 clusters on a 2 x 3 grid.
par(mfrow = c(2, 3))
plot_cluster_profiles(gp5$centers, table.appr, table.appr$gp5_class)

# 7 clusters on a 2 x 4 grid.
par(mfrow = c(2, 4))
plot_cluster_profiles(gp7$centers, table.appr, table.appr$gp7_class)

# 10 clusters on a 2 x 5 grid.
par(mfrow = c(2, 5))
plot_cluster_profiles(gp10$centers, table.appr, table.appr$gp10_class)

# Back to a single-panel layout.
par(mfrow = c(1, 1))
On essaye le package kml. On commence par choisir le nombre de classes sur un échantillon de 1000 trajectoires.
# Work on a random subset of 1000 rows to choose the number of clusters.
ech <- table.appr[sample.int(nrow(table.appr), 1000), ]

# Build the kml longitudinal-data object; each trajectory is identified by
# "<number> - <download_date_trunc>".
donLD <- clusterLongData(
  traj = ech[, 3:26],
  idAll = with(ech, paste0(number, " - ", download_date_trunc))
)

# Try 2 to 6 clusters, 20 redrawings each, plotting the quality criterion.
kml(donLD, nbClusters = 2:6, nbRedrawing = 20, toPlot = "criterion")

# Open an X11 device and start kml's interactive partition chooser.
x11(type = "Xlib")
choice(donLD, typeGraph = "bmp")
On choisit 4 classes.
# Rebuild the longitudinal-data object on the full training table, with
# trajectories identified by "<number> - <download_date_trunc>".
donLD <- clusterLongData(
  traj = table.appr[, 3:26],
  idAll = with(table.appr, paste0(number, " - ", download_date_trunc))
)

# Fit the 4-cluster partition (20 redrawings, no plotting).
kml(donLD, nbClusters = 4, nbRedrawing = 20, toPlot = "none")

# Persist the fitted object, then read it back from disk.
klm4 <- donLD
save(klm4, file = "Donnees/klm4.RData")
load("Donnees/klm4.RData")

# Extract the 4-cluster labels and rename the factor levels to "1".."4".
klm.clusters <- getClusters(klm4, 4)
levels(klm.clusters) <- as.character(1:4)
table.appr$klm4 <- klm.clusters

# Mean trajectory of each cluster, used below as the cluster "centre".
klm4.mean <- calculTrajMean(table.appr[, 3:26], klm.clusters)
# One panel per kml cluster: the cluster mean trajectory in blue, then up to
# 10 randomly drawn member trajectories in grey.
par(mfrow = c(2, 2))
for (i in 1:4) {
  plot(0:23, klm4.mean[i, ], type = "l", col = "blue", lwd = 3,
       xlab = "", ylab = "", ylim = c(0, 1))
  df <- subset(table.appr, klm4 == i)
  # Cap the sample size at the cluster size: asking for 10 rows from a
  # smaller cluster (without replacement) is an error.
  ech <- sample.int(nrow(df), min(10, nrow(df)))
  df <- df[ech, ]
  # seq_len() yields an empty loop for an empty cluster, unlike 1:nrow(df).
  for (j in seq_len(nrow(df))) {
    lines(0:23, df[j, 3:26], col = "grey")
  }
}
# Back to a single-panel layout.
par(mfrow = c(1, 1))